# Answers in boxes with this colour, below.
# See also inline comments with code.
# System info.
version
##                _
## platform       x86_64-w64-mingw32
## arch           x86_64
## os             mingw32
## system         x86_64, mingw32
## status
## major          3
## minor          3.2
## year           2016
## month          10
## day            31
## svn rev        71607
## language       R
## version.string R version 3.3.2 (2016-10-31)
## nickname       Sincere Pumpkin Patch
date()
## [1] "Tue Jan 10 18:18:17 2017"
# Load libraries.
require(car)
require(class)
require(colorspace)
require(cowplot)
require(GGally)
require(gbm)
require(glmnet)
require(gplots)
require(ggplot2)
require(ISLR)
require(leaps)
require(MASS)
require(randomForest)
require(tree)
# Set deliberate seed, so can reproduce results.
set.seed(100)
# Helper functions.

# This is just used to blat the workspace between questions - to help keep them self-contained.
# Less chance of a bug being caused by unwittingly using a variable from a previous question, etc.
clear_workspace <- function() {
  # Helper functions that must survive the reset; everything else goes.
  keep <- c("best_knn", "clear_workspace")
  existing <- ls(envir = .GlobalEnv)
  rm(list = setdiff(existing, keep), envir = .GlobalEnv)
}

# There's a lot of repetition in calls to various KNN models, primarily around looping over
# values of k and printing the confusion matrix for the best one. So this function just
# allows this to be factored out for re-use.
# Fit KNN classifiers for k = 1..maxk, plot test error against k, then refit
# using the best k and report its accuracy, error and confusion matrix.
#
# Args:
#   maxk:    largest value of k to try (must be >= 1).
#   train.X: matrix/data frame of training predictors.
#   test.X:  matrix/data frame of test predictors.
#   train.Y: class labels for the training observations.
#   test.Y:  class labels for the test observations.
#
# Side effects: prints a ggplot of k vs error, the best k, the percentage
# correct, the prediction error and the confusion matrix for that k.
# Returns nothing useful (the value of the final print()).
best_knn <- function(
  maxk,
  train.X,
  test.X,
  train.Y,
  test.Y) {

  # Pre-allocate one error slot per candidate k.
  err <- numeric(maxk)

  # seq_len() rather than 1:maxk - 1:0 would iterate over c(1, 0) if maxk
  # were ever 0.
  for (k in seq_len(maxk)) {
    knn.pred <- knn(
      train = train.X,
      test = test.X,
      cl = train.Y,
      k = k
    )
    err[k] <- mean(knn.pred != test.Y)
  }

  # Plot k vs err.
  print(ggplot() +
    geom_point(
      alpha = 0.4,
      aes(
        x = seq_len(maxk),
        y = err
      )
    ) +
    ggtitle("Classification Error for different values of K") +
    labs(
      x = "k",
      y = "error"
    )
  )

  # Find the value of k which had the minimum error (ties -> smallest k).
  bestk <- which.min(err)
  print(sprintf("best k = %d", bestk))

  # Now retrain the model with the best value of k.
  knn.pred <- knn(
    train = train.X,
    test = test.X,
    cl = train.Y,
    k = bestk
  )

  # Output the percentage of correct predictions for best value of k.
  print(sprintf("percentage correct = %.2f", 100 * mean(knn.pred == test.Y)))

  # Output the prediction error for best value of k.
  print(sprintf("prediction error = %.4f", mean(knn.pred != test.Y)))

  # Output the confusion matrix for best value of k.
  print(table(
    "Predicted" = knn.pred,
    "Actual" = test.Y
  ))
}

Week 5

Exercise 10

This question should be answered using the Weekly data set, which is part of the ISLR package.

# Attach the dataframe, for convenience.
attach(Weekly)

(a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?

# See below; as with the SMarket data, the only obvious relationship - from visual inspection - is between Year and Volume, which are positively correlated (r=0.84).
# Eyeball column names.
names(Weekly)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"
## [7] "Volume"    "Today"     "Direction"
# Check data dimensions.
dim(Weekly)
## [1] 1089    9
# Print summary statistics.
summary(Weekly)
##       Year           Lag1               Lag2               Lag3
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260
##       Lag4               Lag5              Volume
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202
##  Median :  0.2380   Median :  0.2340   Median :1.00268
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821
##      Today          Direction
##  Min.   :-18.1950   Down:484
##  1st Qu.: -1.1540   Up  :605
##  Median :  0.2410
##  Mean   :  0.1499
##  3rd Qu.:  1.4050
##  Max.   : 12.0260
# Plot correlation matrix.
ggpairs(
  data = Weekly,
  mapping = aes(
    colour = Direction,
    alpha = 0.4
  )
)

# Check correlation values (excluding Direction, since this is categorical).
cor(Weekly[,-9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
# Year and Volume appear correlated, plot the mean volume (plus indication of daily range) per year.
means <- aggregate(
  formula = Volume ~ Year,
  data = Weekly,
  FUN = mean
)

ggplot() +
geom_point(
  data = Weekly,
  alpha = 0.4,
  aes(
    x = Year,
    y = Volume
  )
) +
geom_line(
  data = means,
  alpha = 0.4,
  colour = "blue",
  aes(
    x = Year,
    y = Volume
  )
) +
ggtitle("Volume / Mean Volume by Year") +
labs(
  x = "Year",
  y = "Volume"
)

(b) Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?

# See below; Lag2 appears to be statistically significant (p < 0.05), with a coefficient of 0.05844.
# Fit a glm model to our data.
glm.fit <- glm(
  formula = Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
  data = Weekly,
  family = binomial
)
# Output summary stats.
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
##     Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.6949  -1.2565   0.9913   1.0849   1.4579
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181
## Lag2         0.05844    0.02686   2.175   0.0296 *
## Lag3        -0.01606    0.02666  -0.602   0.5469
## Lag4        -0.02779    0.02646  -1.050   0.2937
## Lag5        -0.01447    0.02638  -0.549   0.5833
## Volume      -0.02274    0.03690  -0.616   0.5377
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4

(c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.

# See below. The confusion matrix is telling us which predictions our model got right (true negative for Down and true positive for Up), as well as which predictions it got wrong (false negative for Down - when the actual Direction was Up - and false positive for Up, when the actual Direction was Down).

In this case, the model got a total of (54+557)/1089 predictions correct, or 56.11% (increasing to 56.43% if we just take the cases where the model predicts Up). This is better than chance, although just predicting the majority class (48+557 actual instances of Up) would have resulted in 55.55% correct predictions. Of course, all of this is based on a single data set rather than a training/test split - so we have no idea how these percentages will generalise.
# Use the glm model to predict the probability that Direction is Up, given just the predictors.
glm.probs <- predict(
  object = glm.fit,
  type = "response"
)

# Translate the probabilities into categorical responses.
numrows <- nrow(Weekly)
glm.pred <- rep("Down", numrows)
glm.pred[glm.probs >.5] = "Up"

# Output the confusion matrix.
table(
  "Predicted" = glm.pred,
  "Actual" = Direction
) 
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
# Output the fraction of correct predictions.
sprintf("fraction of correct predictions: %.4f", mean(glm.pred == Direction))
## [1] "fraction of correct predictions: 0.5611"

(d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).

# See below. Using only Lag2 as predictor, and with a training/test split of the data around 2009, we achieve 62.50% correct predictions (rising to 64.29% if we only consider the cases where the model predicts Down).
# Create our "binary filter" to apply to the dataframe.
train <- (Year < 2009)

# Slice out the data where the filter values are FALSE e.g. Year >= 2009. This becomes our test data set.
Weekly.test <- Weekly[!train,]
Direction.test <- Direction[!train]
numrows.test <- nrow(Weekly.test)
# Fit a glm model to the subset of the Weekly data which matches filter values of TRUE (e.g. Year < 2009).
# Note that we're only using Lag2 as predictor, as instructed.
glm.fit <- glm(
  formula = Direction~Lag2,
  data = Weekly,
  family = binomial,
  subset = train
)
# Use the glm model to predict the probability that Direction is Up, given just the predictors.
glm.probs <- predict(
  object = glm.fit,
  newdata = Weekly.test,
  type = "response"
)

# Translate the probabilities into categorical responses.
glm.pred <- rep("Down", numrows.test)
glm.pred[glm.probs >.5] = "Up"

# Output the confusion matrix.
table(
  "Predicted" = glm.pred,
  "Actual" = Direction.test
)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
# Output the fraction of correct predictions.
sprintf("fraction of correct predictions: %.4f", mean(glm.pred == Direction.test))
## [1] "fraction of correct predictions: 0.6250"

(g) Repeat (d) using KNN with K=1

With KNN (k=1), we get 50.96% correct predictions - and a much more even split between Up and Down.
# Create test/training matrices from Lag2 column.
train.X <- as.matrix(Lag2[train])
test.X <- as.matrix(Lag2[!train])
Direction.train <- Direction[train]

knn.pred = knn(
  train = train.X,
  test = test.X,
  cl = Direction.train,
  k = 1
)

# Output the confusion matrix.
table(
  "Predicted" = knn.pred,
  "Actual" = Direction.test
)
##          Actual
## Predicted Down Up
##      Down   21 29
##      Up     22 32
# Output the fraction of correct predictions.
sprintf("fraction of correct predictions: %.4f", mean(knn.pred == Direction.test))
## [1] "fraction of correct predictions: 0.5096"

(h) Which of these methods appears to provide the best results on this data?

# Logistic regression provides better results than KNN (with K=1); 62.50% correct predictions vs 50.96%.

(i) Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.

# See below; adding an interaction between Lag1 and the product of Lag1 and Lag2 to the logistic regression formula increases the percentage of correct predictions to 64.42%. Despite trying values of k from 1:100, KNN could not match this (the best value of k being 72, with 61.54% correct classifications).

Note that the question explicitly asks us to evaluate which variables “provide the best results on the held out data”, even though this means that we’re implicitly tuning our model to the test data (rather than using e.g. cross-validation on the training set).

# Logistic Regression

glm.fit <- glm(
  formula = Direction~Lag2+Lag1:I(Lag1*Lag2),
  data = Weekly,
  family = binomial,
  subset = train
)

glm.probs <- predict(
  object = glm.fit,
  newdata = Weekly.test,
  type = "response"
)

glm.pred <- rep("Down", numrows.test)
glm.pred[glm.probs >.5] <- "Up"

table(
  "Predicted" = glm.pred,
  "Actual" = Direction.test
)
##          Actual
## Predicted Down Up
##      Down    8  2
##      Up     35 59
sprintf("fraction of correct predictions: %.4f", mean(glm.pred == Direction.test))
## [1] "fraction of correct predictions: 0.6442"

# KNN

best_knn(100, train.X, test.X, Direction.train, Direction.test)

## [1] "best k = 72"
## [1] "percentage correct = 61.54"
## [1] "prediction error = 0.3846"
##          Actual
## Predicted Down Up
##      Down   15 12
##      Up     28 49

Exercise 12

(a) Write a function, Power(), that prints out the result of raising 2 to the 3rd power. In other words, your function should compute 2^3 and print out the results.

# Compute 2 raised to the 3rd power and return it as a formatted string.
power <- function() {
    result <- 2^3
    sprintf("2 ^ 3 = %.2f", result)
}
power()
## [1] "2 ^ 3 = 8.00"

(b) Create a new function, Power2(), that allows you to pass any two numbers, x and a, and prints out the value of x^a.

# Return "power2(x,a) = x^a" as a formatted string.
#
# The original used %d, which sprintf() rejects for non-whole-number doubles
# (e.g. power2(2.5, 2) would error). format(..., scientific = FALSE) renders
# whole numbers identically to %d while also accepting fractional inputs.
power2 <- function(x, a) {
    sprintf(
        "power2(%s,%s) = %.2f",
        format(x, scientific = FALSE),
        format(a, scientific = FALSE),
        x^a
    )
}

(c) Using the Power2() function that you just wrote, compute 10^3, 8^17, and 131^3.

power2(10,3) 
## [1] "power2(10,3) = 1000.00"
power2(8,17) 
## [1] "power2(8,17) = 2251799813685248.00"
power2(131,3)
## [1] "power2(131,3) = 2248091.00"

(d) Now create a new function, Power3(), that actually returns the result x^a as an R object, rather than simply printing it to the screen.

# Return x raised to the power a (vectorised over both arguments).
power3 <- function(x, a) {
    x^a
}

(e) Now using the Power3() function, create a plot of f(x)=x^2. The x-axis should display a range of integers from 1 to 10, and the y-axis should display x^2. Label the axes appropriately, and use an appropriate title for the figure. Consider displaying either the x-axis, the y-axis, or both on the log-scale.

x <- 1:10
xsquared <- power3(x,2)

ggplot() +
geom_line(
  alpha = 0.4,
  aes(
    x = x,
    y = xsquared
  )
) +
scale_x_log10() +
scale_y_log10() +
ggtitle("x^2 (log10 x and y axes)") +
labs(
  x = "x",
  y = "x^2"
)

(f) Create a function, PlotPower(), that allows you to create a plot of x against x^a for a fixed a and for a range of values of x. For instance, if you call PlotPower(1:10,3) then a plot should be created with an x-axis taking on values 1,2,…,10, and a y-axis taking on values 1^3, 2^3, …, 10^3.

# Plot xrange against xrange^pow in a 2x2 grid of panels: linear axes,
# log10 x-axis, log10 y-axis, and both axes on the log10 scale.
#
# Args:
#   xrange: numeric vector of x values to plot (e.g. 1:10).
#   pow:    fixed exponent applied to each x value (via power3()).
PlotPower <- function(xrange, pow) {

  # Base label for titles and the y-axis, e.g. "x^3".
  label <- paste0("x^", pow)
  # These layers are shared by all four panels; aes() captures xrange/pow
  # lazily from this function's environment.
  plotline <- geom_line(
    aes(
      x = xrange,
      y = power3(xrange, pow)
    )
  )
  plotlabs <- labs(
    x = "x",
    y = label
  )
  p1 <- ggplot() + plotline + ggtitle(label) + plotlabs

  label2 <- paste(label, "(log10 x-axis)")
  p2 <- ggplot() + plotline + ggtitle(label2) + plotlabs + scale_x_log10()

  label3 <- paste(label, "(log10 y-axis)")
  p3 <- ggplot() + plotline + ggtitle(label3) + plotlabs + scale_y_log10()

  label4 <- paste(label, "(log10 x and y axes)")
  p4 <- ggplot() + plotline + ggtitle(label4) + plotlabs + scale_x_log10() + scale_y_log10()

  # Arrange the four panels in a grid (cowplot).
  plot_grid(p1, p2, p3, p4)
}
PlotPower(1:10,3)

Week 7

Finish Exercise 10 from last homework

Exercise 10

(e) Repeat (d) using LDA.

Using LDA, we (again) obtain 62.50% correct predictions - and the same confusion matrix as with logistic regression.
lda.fit <- lda(
  formula = Direction~Lag2,
  data = Weekly,
  subset = train
)

lda.pred <- predict(
  object = lda.fit,
  newdata = Weekly.test
)

lda.class <- lda.pred$class

table(
  "Predicted" = lda.class,
  "Actual" = Direction.test
)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
sprintf("fraction of correct predictions: %.4f", mean(lda.class == Direction.test))
## [1] "fraction of correct predictions: 0.6250"

(f) Repeat (d) using QDA.

With QDA, we obtain 58.65% correct predictions (and always predict Up).
qda.fit <- qda(
  formula = Direction~Lag2,
  data = Weekly,
  subset = train
)

qda.pred <- predict(
  object = qda.fit,
  newdata = Weekly.test
)

qda.class <- qda.pred$class

table(
  "Predicted" = qda.class,
  "Actual" = Direction.test
)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
sprintf("fraction of correct predictions: %.4f", mean(qda.class == Direction.test))
## [1] "fraction of correct predictions: 0.5865"
# Housekeeping.
clear_workspace()
detach(Weekly)

Exercise 11

(a) Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median.

# Create a factor column for our mpg01 variable.
mpg.median <- median(Auto$mpg)
mpg01 <- as.factor(ifelse(Auto$mpg < mpg.median, 0, 1))
# Create a new version of Auto with the mpg01 column added.
Auto <- data.frame(Auto, mpg01)
# Remove temporary variables to avoid confusion due to e.g. shadowing.
rm(mpg.median, mpg01)
# Only attach our Auto frame now, to avoid confusion due to e.g. shadowing.
# Use of attach/detach is of course debatable due to these issues..
attach(Auto)
summary(Auto)
##       mpg          cylinders      displacement     horsepower
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0
##
##      weight      acceleration        year           origin
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000
##  1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000
##  Median :2804   Median :15.50   Median :76.00   Median :1.000
##  Mean   :2978   Mean   :15.54   Mean   :75.98   Mean   :1.577
##  3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000
##
##                  name     mpg01
##  amc matador       :  5   0:196
##  ford pinto        :  5   1:196
##  toyota corolla    :  5
##  amc gremlin       :  4
##  amc hornet        :  4
##  chevrolet chevette:  4
##  (Other)           :365

(b) Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.

All of the variables except name appear to be correlated with mpg - and by extension, mpg01 - to some extent. However, there are likely to be significant collinearities between them - with more cylinders, displacement and horsepower, for example, resulting in greater weight (e.g. a bigger engine).

VIF results from a fitted linear model confirmed this; iteratively removing predictors with VIF > 5 left only cylinders, acceleration, year and origin - with only cylinders, year and origin being significantly correlated. Of these, there is a definite suggestion of non-linearity from the scatter plot of cylinders vs mpg - and a polynomial of degree three proved significant.
# Plot correlation matrix - using ggpairs (since this automatically gives us both scatterplots and boxplots).
ggpairs(
  data = Auto[,-9],
  mapping = aes(
    colour = mpg01,
    alpha = 0.4
  )
)

# Check for colinearities.
glm.fit <- glm(
  formula = mpg01~cylinders+displacement+horsepower+weight+acceleration+year+origin,
  family = binomial,
  data = Auto
)
vif(glm.fit)
##    cylinders displacement   horsepower       weight acceleration
##     4.900837     8.852836     2.691385     4.865771     2.651544
##         year       origin
##     1.686770     1.901268
# Remove any predictors with vif > 5 and repeat until all < 5
glm.fit <- update(glm.fit, . ~ . -displacement)
vif(glm.fit)
##    cylinders   horsepower       weight acceleration         year
##     2.107526     2.673592     3.392873     2.618422     1.596734
##       origin
##     1.307502
# Summary of our final model - only cylinders, year and origin have significant p values
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ cylinders + horsepower + weight + acceleration +
##     year + origin, family = binomial, data = Auto)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.3976  -0.1066   0.0083   0.2129   3.1958
##
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -1.706e+01  5.735e+00  -2.974  0.00294 **
## cylinders    -1.071e-01  2.780e-01  -0.385  0.70014
## horsepower   -4.126e-02  2.385e-02  -1.730  0.08357 .
## weight       -4.208e-03  9.541e-04  -4.410 1.03e-05 ***
## acceleration  1.365e-02  1.407e-01   0.097  0.92268
## year          4.266e-01  7.314e-02   5.833 5.44e-09 ***
## origin        4.422e-01  2.994e-01   1.477  0.13965
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 543.43  on 391  degrees of freedom
## Residual deviance: 157.57  on 385  degrees of freedom
## AIC: 171.57
##
## Number of Fisher Scoring iterations: 8
glm.fit <- update(glm.fit, . ~ . -acceleration)

# Checking for any significant polynomials since relationship between cylinders and mpg appears non-linear
glm.fit <- update(glm.fit, . ~ . - cylinders + poly(cylinders,3))
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ horsepower + weight + year + origin + poly(cylinders,
##     3), family = binomial, data = Auto)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -2.79864  -0.10001   0.00351   0.14682   2.63330
##
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -2.244e+01  6.062e+00  -3.701 0.000215 ***
## horsepower          -4.209e-02  1.753e-02  -2.402 0.016327 *
## weight              -4.489e-03  9.008e-04  -4.984 6.24e-07 ***
## year                 5.071e-01  8.882e-02   5.710 1.13e-08 ***
## origin               6.952e-01  3.424e-01   2.030 0.042332 *
## poly(cylinders, 3)1  1.136e+01  1.018e+01   1.116 0.264447
## poly(cylinders, 3)2  1.602e+01  4.857e+00   3.299 0.000970 ***
## poly(cylinders, 3)3  1.337e+01  3.260e+00   4.100 4.13e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 543.43  on 391  degrees of freedom
## Residual deviance: 132.39  on 384  degrees of freedom
## AIC: 148.39
##
## Number of Fisher Scoring iterations: 8
formula <- as.formula(mpg01 ~ cylinders + I(cylinders^3) + year + origin) 

(c) Split the data into a training set and a test set.

# Am using a 50/50 split per the ISLR examples
train <- sample(nrow(Auto), nrow(Auto)/2)
Auto.train <- Auto[train,]
Auto.test <- Auto[-train,]
mpg01.test <- mpg01[-train]
numrows.test <- nrow(Auto.test)

(d) Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

# The test error for LDA, using the specified predictors, is 9.69%.
lda.fit <- lda(
  formula = formula,
  data = Auto.train
)

lda.pred <- predict(
  object = lda.fit,
  newdata = Auto.test
)

lda.class = lda.pred$class

table(
  "Predicted" = lda.class,
  "Actual" = mpg01.test
)
##          Actual
## Predicted  0  1
##         0 79  5
##         1 14 98
sprintf("test error: %.4f", mean(lda.class != mpg01.test))
## [1] "test error: 0.0969"

(e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

# The test error for QDA, using the specified predictors, is 8.16%.
qda.fit <- qda(
  formula = formula,
  data = Auto.train
)

qda.pred <- predict(
  object = qda.fit,
  newdata = Auto.test
)

qda.class <- qda.pred$class

table(
  "Predicted" = qda.class,
  "Actual" = mpg01.test
)
##          Actual
## Predicted   0   1
##         0  79   2
##         1  14 101
sprintf("test error: %.4f", mean(qda.class != mpg01.test))
## [1] "test error: 0.0816"

(f) Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?

# The test error for logistic regression, using the specified predictors, is 9.18%.
glm.fit <- glm(
  formula = formula,
  data = Auto.train,
  family = binomial
)

glm.probs <- predict(
  object = glm.fit,
  newdata = Auto.test,
  type = "response"
)

glm.pred <- rep(0, numrows.test)
glm.pred[glm.probs >.5] = 1

table(
  "Predicted" = glm.pred,
  "Actual" = mpg01.test
)
##          Actual
## Predicted   0   1
##         0  75   0
##         1  18 103
sprintf("test error: %.4f", mean(glm.pred != mpg01.test))
## [1] "test error: 0.0918"

(g) Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?

# See plot below of k vs test error. The test error for the best k (k=8), using the specified predictors, is 8.16%. Note that lower test error rates (~6%) are possible by including more predictors in the model.
# KNN
cylinders3 = cylinders ^ 3
standardized.X <- scale(data.frame(cylinders3, Auto)[, 1:9])
predictors = c("cylinders", "cylinders3", "year", "origin")
train.X <- standardized.X[train, predictors]
test.X <- standardized.X[-train, predictors]
train.Y <- mpg01[train]
test.Y <- mpg01[-train]
best_knn(100, train.X, test.X, train.Y, test.Y)

## [1] "best k = 8"
## [1] "percentage correct = 91.84"
## [1] "prediction error = 0.0816"
##          Actual
## Predicted   0   1
##         0  78   1
##         1  15 102
clear_workspace()
detach(Auto)

Exercise 13

Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.

# See below (especially in-line comments). I adopted two subsets of predictors, one based on p-values (having adjusted for VIF), the other based on best subsets. Logistic regression performed better than LDA, with the “significant predictors” model performing better (test error 9.88%) than best subset (test error of 11.46%).

However KNN (k=1) beat both parametric models, with a “significant predictors” test error of 3.95%. This is not surprising, since the data represents geographical areas and is more likely to be consistent within suburbs than between them. Especially in downtown regions (presumably observations ~375 to ~475) which may be special cases.
# As before - construct our new factor variable, add it to our dataframe, cleanup, and then attach.
crim.median <- median(Boston$crim)
crim01 <- as.factor(ifelse(Boston$crim < crim.median, 0, 1))
Boston <- data.frame(Boston, crim01)
rm(crim.median, crim01)
attach(Boston)
# Show scatterplots and boxplots.
ggpairs(
  data = Boston[,-9],
  mapping = aes(
    colour = crim01,
    alpha = 0.4
  )
)

# Check for colinearities.
glm.fit <- glm(
  formula = crim01~zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv,
  data = Boston,
  family = binomial
)
vif(glm.fit)
##       zn    indus     chas      nox       rm      age      dis      rad
## 1.872010 2.640170 1.233983 4.111013 5.734988 2.455666 3.876479 1.983825
##      tax  ptratio    black    lstat     medv
## 1.932622 2.191848 1.107171 2.670299 8.088729
# Remove any predictors with vif > 5 and repeat until all < 5
glm.fit <- update(glm.fit, . ~ . -medv)
vif(glm.fit)
##       zn    indus     chas      nox       rm      age      dis      rad
## 1.731690 2.622190 1.171490 3.701289 2.239184 1.916569 2.964372 1.872988
##      tax  ptratio    black    lstat
## 1.811987 1.512531 1.105251 2.659335
# Check which predictors are significant - leaving zn, nox, rm, dis, rad, tax, ptratio.
summary(glm.fit)
##
## Call:
## glm(formula = crim01 ~ zn + indus + chas + nox + rm + age + dis +
##     rad + tax + ptratio + black + lstat, family = binomial, data = Boston)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.5155  -0.2144  -0.0008   0.0031   3.2765
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -32.394832   6.327425  -5.120 3.06e-07 ***
## zn           -0.075484   0.031780  -2.375  0.01754 *
## indus        -0.055163   0.043127  -1.279  0.20087
## chas          0.866508   0.751363   1.153  0.24881
## nox          44.493847   6.937852   6.413 1.42e-10 ***
## rm            1.034095   0.420817   2.457  0.01400 *
## age           0.007593   0.010389   0.731  0.46485
## dis           0.479160   0.190858   2.511  0.01205 *
## rad           0.643633   0.146068   4.406 1.05e-05 ***
## tax          -0.007432   0.002624  -2.833  0.00462 **
## ptratio       0.208206   0.098197   2.120  0.03398 *
## black        -0.012674   0.006695  -1.893  0.05833 .
## lstat         0.043258   0.047431   0.912  0.36176
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 701.46  on 505  degrees of freedom
## Residual deviance: 218.82  on 493  degrees of freedom
## AIC: 244.82
##
## Number of Fisher Scoring iterations: 9
glm.fit <- update(glm.fit, . ~ zn+nox+rm+dis+rad+tax+ptratio)

# Check for non-linearity (plot residuals).

# Interestingly, plotting the residuals vs crim (not crim01) shows that things go a bit crazy around
# row 400. A plot of glm.fit flags up rows 381, 406 and 419 - which do appear to be outliers, having
# the three highest values of crim (all above 73, vs a median of 0.2565). Presumably these are city
# centre areas; they do have higher tax and lower dis values. Maybe splines would help with handling
# this(?)

ggplot() +
geom_point(
  alpha = 0.4,
  aes(
    x = 1:nrow(Boston),
    y = lm(crim ~ zn+nox+rm+dis+rad+tax+ptratio)$residuals
  )
) +
ggtitle("Residuals for crim~zn+nox+rm+dis+rad+tax+ptratio") +
labs(
  x = "observation",
  y = "residual"
)

# The residuals from the binomial model is consistent with this idea, in that there are a series
# of flat lines (presumably similar types of suburb) interspersed by small regions of variability.
# Possibly there is some kind of geographical ordering to the Boston data that results in this.

ggplot() +
geom_point(
  alpha = 0.4,
  aes(
    x = 1:nrow(Boston),
    y = glm.fit$residuals
  )
) +
scale_y_continuous(
  limits = c(0, 10),
  oob = scales::squish
) +
ggtitle("Residuals for crim01~zn+nox+rm+dis+rad+tax+ptratio") +
labs(
  x = "observation",
  y = "residual"
)

# By way of an alternative to basing the model on p-values, I'm checking best subsets.
# Note I'm using Cp as estimate of test error, on the whole data set, for convenience.
# All 13 Boston predictors are allowed into the subsets.
nvmax <- 13
Boston.best = regsubsets(
  x = crim01~zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv,
  data=Boston,
  nvmax=nvmax
)
best.summary = summary(Boston.best)
# Pick the subset size minimising Mallows' Cp.
bestnv <- which.min(best.summary$cp)
sprintf("best nv: %d", bestnv)
## [1] "best nv: 6"
# Report the predictors in the winning subset (drop the intercept).
Boston.best.coef <- coef(Boston.best, bestnv)
print(names(Boston.best.coef[-1]))
## [1] "zn"    "nox"   "age"   "rad"   "black" "medv"
# Create training/test split
train <- sample(nrow(Boston), nrow(Boston)/2) # sticking with 50/50 split per ISLR examples.
Boston.train <- Boston[train,]
Boston.test <- Boston[-train,]
numrows.test <- nrow(Boston.test)
# crim01 is the binary response vector built earlier; split it the same way.
crim01.test <- crim01[-train]

# The two candidate formulas compared throughout this question:
# p-value-based selection vs the Cp-based best subset found above.
significant.predictors <- as.formula(crim01 ~ zn+nox+rm+dis+rad+tax+ptratio)
best.subset <- as.formula(crim01~zn+nox+age+rad+black+medv)

# Logistic Regression - using best subset

# Fit the logistic model on the training half using the best-subset formula.
glm.fit <- glm(
  formula = best.subset,
  data = Boston.train,
  family = binomial
)

# Predicted P(crim01 = 1) for each test observation.
glm.probs <- predict(
  glm.fit,
  Boston.test,
  type="response"
)

# Threshold the probabilities at 0.5 to obtain class predictions.
glm.pred <- rep(0, numrows.test)
glm.pred[glm.probs >.5] <- 1

# Confusion matrix of predictions vs actual test labels.
table(
  "Predicted" = glm.pred,
  "Actual" = crim01.test
)
##          Actual
## Predicted   0   1
##         0 118  19
##         1  10 106
sprintf("prediction error: %.4f", mean(glm.pred != crim01.test))
## [1] "prediction error: 0.1146"

# Logistic regression - using significant predictors

# Same procedure as above, but with the p-value-selected formula.
glm.fit <- glm(
  formula = significant.predictors,
  data = Boston.train,
  family = binomial
)

# Predicted P(crim01 = 1) for each test observation.
glm.probs <- predict(
  glm.fit,
  Boston.test,
  type="response"
)

# Threshold at 0.5 for class predictions.
glm.pred <- rep(0, numrows.test)
glm.pred[glm.probs >.5] <- 1

# Confusion matrix of predictions vs actual test labels.
table(
  "Predicted" = glm.pred,
  "Actual" = crim01.test
)
##          Actual
## Predicted   0   1
##         0 118  15
##         1  10 110
sprintf("prediction error: %.4f", mean(glm.pred != crim01.test))
## [1] "prediction error: 0.0988"

# LDA - using best subset

# Linear discriminant analysis (MASS) on the training half, best-subset formula.
lda.fit <- lda(
  formula = best.subset,
  data = Boston.train
)

lda.pred <- predict(
  object = lda.fit,
  newdata = Boston.test
)

# predict.lda returns posterior probabilities etc.; $class holds the labels.
lda.class <- lda.pred$class

# Confusion matrix of predictions vs actual test labels.
table(
  "Predicted" = lda.class,
  "Actual" = crim01.test
)
##          Actual
## Predicted   0   1
##         0 122  31
##         1   6  94
sprintf("prediction error: %.4f", mean(lda.class != crim01.test))
## [1] "prediction error: 0.1462"

# LDA - using significant predictors

# As above, but with the p-value-selected formula.
lda.fit <- lda(
  formula = significant.predictors,
  data = Boston.train
)

lda.pred <- predict(
  object = lda.fit,
  newdata = Boston.test
)

# Extract the predicted class labels.
lda.class <- lda.pred$class

# Confusion matrix of predictions vs actual test labels.
table(
  "Predicted" = lda.class,
  "Actual" = crim01.test
)
##          Actual
## Predicted   0   1
##         0 118  33
##         1  10  92
sprintf("test prediction error: %.4f", mean(lda.class != crim01.test))
## [1] "test prediction error: 0.1700"

# KNN - Using best subset

maxk <- 100
# Standardise all predictors so KNN distances aren't dominated by
# large-scale variables. Columns 1:14 are the original Boston columns;
# crim01 is held as a separate vector, not a Boston column.
standardized.X <- scale(Boston[, 1:14])
train.Y <- crim01[train]
test.Y <- crim01[-train]

# best.subset[[3]] is the RHS of the formula; all.vars() extracts its
# variable names so we can column-subset the standardised matrix.
predictors <- all.vars(best.subset[[3]])
train.X <- standardized.X[train, predictors]
test.X <- standardized.X[-train, predictors]
best_knn(maxk, train.X, test.X, train.Y, test.Y)

## [1] "best k = 3"
## [1] "percentage correct = 89.33"
## [1] "prediction error = 0.1067"
##          Actual
## Predicted   0   1
##         0 118  17
##         1  10 108

# KNN - Using significant predictors

# Same helper, now subsetting the standardised matrix to the
# p-value-selected predictors (RHS of the formula).
predictors <- all.vars(significant.predictors[[3]])
train.X <- standardized.X[train, predictors]
test.X <- standardized.X[-train, predictors]
best_knn(maxk, train.X, test.X, train.Y, test.Y)

## [1] "best k = 1"
## [1] "percentage correct = 96.05"
## [1] "prediction error = 0.0395"
##          Actual
## Predicted   0   1
##         0 126   8
##         1   2 117

Week 9

Exercise 7

In the lab, we applied random forests to the Boston data using mtry=6 and using ntree=25 and ntree=500. Create a plot displaying the test error resulting from random forests on this data set for a more comprehensive range of values for mtry and ntree. Describe the results obtained.

# See below. As might be expected, prediction error is greater where values of mtry and ntree are low. This appears to be more pronounced for ntree than for mtry; the heatmap suggests that the highest errors result from ntree = 1, and it is not until values of ntree >= 50 that errors stabilise. Whereas errors from values of mtry as low as 3 are quite similar. The minimum test mse (11.9062) was found with ntree = mtry = 7.
# Range of values for mtry (number of variables to select from).
# There are only 13 (not including our predictor, medv), so we may as well try them all.
mtry <- seq(1,13)

# Less sure what to cover in terms of range of ntree (number of trees to build). It's
# likely that there will be a bigger difference between 2 trees vs 5, 10, etc, than there
# will be between 100 and 200. So I'm using a sequence of 1 to 10, in steps of 1, and then
# a sequence of 25 to 500, in steps of 25.
ntree <- c(1:10, seq(25,500,25))

# Set up a vector to contain our test errors for each combination of mtry and ntree.
# Pre-allocated; filled with ntree varying fastest (inner loop below).
err <- vector('numeric', length(mtry) * length(ntree))
index <- 1

# And now loop through the combinations, fitting a random forest model to each.
for (i in mtry) {
  for (j in ntree) {
    # fit the model to the training set
    rf.boston <- randomForest(
      formula = medv ~ . -crim01, # exclude crim01 from the previous question
      data = Boston,
      subset = train,
      mtry = i,
      ntree = j
    )
    # predict medv from the test data
    medv.rf <- predict(
      object = rf.boston,
      newdata = Boston.test
    )
    # store the error for later
    mse <- mean((medv.rf - Boston.test$medv) ^ 2)
    err[index] <- mse
    index <- index + 1
  }
}

# Now analyse the errors we recorded during the loop.
# err was filled with ntree varying fastest (inner loop), so recover the
# (mtry, ntree) pair from the flat 1-based index with the usual
# (index - 1) adjustment before the integer-divide / modulo.
# NOTE(review): the previous arithmetic - floor(min.err.index / length(ntree))
# with no (index - 1) adjustment - was off by one: indices on a row boundary
# mapped to the wrong mtry and gave an ntree subscript of 0. The recorded
# output below predates this fix; for the same err vector the corrected
# mapping reports mtry=8 rather than mtry=7.
min.err.index <- which.min(err)
min.err.mtry <- mtry[((min.err.index - 1) %/% length(ntree)) + 1]
min.err.ntree <- ntree[((min.err.index - 1) %% length(ntree)) + 1]
sprintf("min error %.4f found with mtry=%d, ntree=%d", err[min.err.index], min.err.mtry, min.err.ntree)
## [1] "min error 11.9062 found with mtry=7, ntree=7"
# Do a basic 2-D plot of ntree vs error, with 13 lines representing each
# of our 13 values of mtry.
colors <- rainbow(max(mtry))
lty <- c(rep(1, 8), rep(2, 5))
numx <- 10 # just plotting the first 10 error values for each mtry, for clarity
for (i in mtry) {
  # err holds length(ntree) entries per mtry value (ntree varied fastest in
  # the fitting loop), so each mtry's run starts at a stride of length(ntree).
  # NOTE(review): the previous code strode by numx (10), so every line after
  # the first plotted errors belonging to the wrong mtry value.
  start <- (i - 1) * length(ntree) + 1
  errs <- err[start:(start + numx - 1)]
  col <- colors[i]
  if (i == 1) {
    plot(ntree[1:numx], errs, col = col, type = "l", xlab = "ntree", ylab = "mse", ylim = c(9, 65))
  } else {
    lines(ntree[1:numx], errs, col = col, type = "l", lty = lty[i])
  }
}
# Legend uses the same rainbow palette as the plotted lines.
# NOTE(review): previously the legend used col = mtry+1 (the default palette),
# which did not match the rainbow() colours of the lines.
legend("topright", as.character(mtry), col = colors, cex = 1, lty = lty, title = "mtry")

# We have an error that varies along two dimensions, e.g. a surface.
# So a 3-D representation might be helpful. For which we need a matrix,
# not a vector.
# byrow=TRUE matches err's layout (ntree varying fastest), so each row of z
# corresponds to one mtry value.
z <- matrix(err, nrow=length(mtry), ncol=length(ntree), byrow=TRUE)

# Try the default persp plot from R
persp(
  x = mtry,
  y = ntree,
  z = z,
  theta = 45,
  phi = 45,
  zlab = "mse"
)

# Also try heatmap.2, from the gplots package (colour palette from colorspace)
heatmap.2(
  z,
  Rowv = "none",
  Colv = "none",
  dendrogram = "none",
  density.info = "none",
  trace = "none",
  labRow = mtry,
  labCol = ntree,
  xlab = "ntree",
  ylab = "mtry",
  key.xlab = "mse",
  col = diverge_hcl(length(mtry), c = 100, l = c(50, 90), power = 1)
)

# Tidy up before the next question.
clear_workspace()
detach(Boston)

Exercise 8

In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.

attach(Carseats)

(a) Split the data set into a training set and a test set.

# Using same 50/50 split as the book.
train <- sample(nrow(Carseats), nrow(Carseats)/2)
Carseats.test <- Carseats[-train,] 

(b) Fit a regression tree to the training set. Plot the tree, and interpret the results. What test MSE do you obtain?

# See below. The regression tree fit to the training data uses the variables ShelveLoc, Price, CompPrice, Age & Advertising, has 12 internal nodes (not including the root), 14 terminal nodes, and a residual mean deviance (sum of squared errors / (num observations - num terminal nodes)) of 2.124. The MSE against the test set was 4.7117.
Carseats.tree <- tree(
  formula = Sales~.,
  data = Carseats,
  subset = train
)

summary(Carseats.tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats, subset = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "CompPrice"   "Age"         "Advertising"
## Number of terminal nodes:  14
## Residual mean deviance:  2.124 = 395 / 186
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
## -3.22800 -0.90350  0.05843  0.00000  0.79680  3.79300
plot(Carseats.tree)
text(Carseats.tree, pretty = 0, font = 4, cex = 0.75)

Carseats.pred <- predict(
  object = Carseats.tree,
  newdata = Carseats.test)

sprintf("test mse: %.4f", mean((Carseats.pred - Carseats.test$Sales) ^ 2))
## [1] "test mse: 4.7117"

(c) Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?

# See below. Cross-validation shows the optimal level of tree complexity to be 14; the same as the unpruned tree.
# Perform the cross-validation
Carseats.cv.tree <- cv.tree(Carseats.tree)

# Output the results.
Carseats.cv.tree
## $size
##  [1] 14 13 12 11 10  9  8  7  6  5  4  3  2  1
##
## $dev
##  [1]  808.0497  829.7067  839.8778  863.0028  871.3421 1011.5556 1025.3040
##  [8] 1030.4866 1030.4866 1066.6634 1087.5819 1133.3372 1193.5623 1471.3207
##
## $k
##  [1]      -Inf  18.02420  18.50311  19.21496  22.57085  47.82583  49.58968
##  [8]  51.36982  54.55920  83.87959  99.43388 114.56276 173.30361 285.82782
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune"         "tree.sequence"
# Determine the best tree size
sprintf("best tree size: %d",
    Carseats.cv.tree$size[which.min(Carseats.cv.tree$dev)]
)
## [1] "best tree size: 14"
# Plot deviance vs tree size
ggplot() +
geom_line(
  colour = "blue",
  alpha = 0.4,
  aes(
    x = Carseats.cv.tree$size,
    y = Carseats.cv.tree$dev
  )
) +
scale_x_continuous(
  breaks = Carseats.cv.tree$size
) +
ggtitle("Deviance vs Tree Size") +
labs(
  x = "tree size",
  y = "deviance"
)

# Pruning doesn't seem to be necessary in this case

(d) Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.

# See below; with bagging the test MSE was 2.8308, and the most important variables were Price & ShelveLoc, and then CompPrice, Age & Advertising.
# Fit randomForest model to the training data, using all predictors
# e.g. bagging. Specify importance = TRUE so that we can use this
# feature for later interpretation.
Carseats.bag <- randomForest(
  formula = Sales~.,
  data = Carseats,
  subset = train,
  mtry = length(Carseats)-1,
  importance = TRUE
)

# Print the model descriptives
Carseats.bag
##
## Call:
##  randomForest(formula = Sales ~ ., data = Carseats, mtry = length(Carseats) -      1, importance = TRUE, subset = train)
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 10
##
##           Mean of squared residuals: 2.56219
##                     % Var explained: 64.26
# Now predict the values of Sales in the test data.
Carseats.pred <- predict(
  object = Carseats.bag,
  newdata = Carseats.test
)

# Print/calculate the MSE from test data predictions.
sprintf("test mse: %.4f", mean((Carseats.pred - Carseats.test$Sales) ^ 2))
## [1] "test mse: 2.8308"
# See which predictors made the biggest contributions to MSE / node purity.
importance(Carseats.bag)
##                %IncMSE IncNodePurity
## CompPrice   23.6361692    143.294132
## Income       2.1541598     66.972171
## Advertising 11.9378536    101.667610
## Population   0.9711789     57.362038
## Price       56.4945990    474.718399
## ShelveLoc   45.2439143    357.441953
## Age         14.6443266    153.419118
## Education   -4.2397685     30.460911
## Urban       -0.8729681      4.965755
## US          -0.8521151      5.722876

(e) Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.

# See below. For 1 <= m <= 10, MSE ranged from 2.8172 to 5.4342, with an average of 3.3177. The importance function suggests that Price and ShelveLoc are the two most important predictors, followed by CompPrice, and then Age and Advertising.

MSE was monotonically decreasing (or at least, not increasing) with m, with the lowest test MSE using the full 10 predictor variables. The test MSE was also similar to that from the bagging model. This suggests that the predictor variables were not strongly correlated, otherwise we would expect to see an improvement using randomForest (with its decorrelation of predictors). This was supported by a call to VIF against a standard linear model.
# Sales~. has 10 predictors, so we're going to build 10 trees - with each tree considering one of 1:10 variables at each split.
mtrys <- seq(1,10)
# Pre-allocate the test-MSE vector, one slot per candidate mtry.
err <- vector('numeric', length(mtrys))
index <- 1
for (mtry in mtrys) {
  # Loop round fitting each tree to the training data,
  # and storing its associated test MSE.
  Carseats.rf <- randomForest(
    formula = Sales~.,
    data = Carseats,
    subset = train,
    mtry = mtry,
    importance=FALSE # no need as we're gonna refit in a bit
  )
  Carseats.pred <- predict(
    object = Carseats.rf,
    newdata = Carseats.test
  )
  mse <- mean((Carseats.pred - Carseats.test$Sales) ^ 2)
  err[index] <- mse
  index <- index + 1
}

# Now plot the test MSE vs the number of variables considered at each split.
plot(err)

# Print some summary stats.
sprintf("mse: min=%.4f, max=%.4f, mean=%.4f", min(err), max(err), mean(err))
## [1] "mse: min=2.8172, max=5.4342, mean=3.3177"
# Find which mtry resulted in minimum test MSE.
best.mtry <- which.min(err)
sprintf("best mtry: %d", best.mtry)
## [1] "best mtry: 10"
# Refit a tree to that value of mtry, with Importance enabled.
Carseats.rf <- randomForest(
    formula = Sales~.,
    data = Carseats,
    subset = train,
    mtry = best.mtry,
    importance=TRUE
)

# And check each variable's contribution to MSE/node purity.
importance(Carseats.rf)
##                %IncMSE IncNodePurity
## CompPrice   22.2323823    139.164686
## Income       2.4258522     68.609560
## Advertising 11.1183666     93.608566
## Population   0.3951717     54.022213
## Price       56.1352408    476.750617
## ShelveLoc   49.1109358    366.511018
## Age         18.3997934    157.938858
## Education   -2.3454048     29.789933
## Urban       -0.4835734      5.476033
## US          -0.4938681      5.537742
# The best value of mtry was 10; the full number of predictors - suggesting that the
# predictors are not strongly correlated (otherwise we would have expected to see a
# difference in the RF model vs bagging). Check this with VIF.
vif(glm(formula = Sales~., data = Carseats))
##                 GVIF Df GVIF^(1/(2*Df))
## CompPrice   1.554618  1        1.246843
## Income      1.024731  1        1.012290
## Advertising 2.103136  1        1.450219
## Population  1.145534  1        1.070296
## Price       1.537068  1        1.239785
## ShelveLoc   1.033891  2        1.008367
## Age         1.021051  1        1.010471
## Education   1.026342  1        1.013086
## Urban       1.022705  1        1.011289
## US          1.980720  1        1.407380
clear_workspace()
detach(Carseats)

Exercise 9

This problem involves the OJ data set which is part of the ISLR package.

attach(OJ)

(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.

train <- sample(nrow(OJ), 800)
OJ.test <- OJ[-train,] 

(b) Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?

# See below; the tree uses 4 predictors (LoyalCH, SalePriceMM, PriceDiff and DiscMM), has 7 terminal nodes, a residual mean deviance of 0.7636 (= 605.6 / 793) and a training error rate of 17.13%.
# Fit classification tree to the training data.
OJ.tree <- tree(
  formula = Purchase~.,
  data = OJ,
  subset = train)

# Output summary info.
summary(OJ.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ, subset = train)
## Variables actually used in tree construction:
## [1] "LoyalCH"     "SalePriceMM" "PriceDiff"   "DiscMM"
## Number of terminal nodes:  7
## Residual mean deviance:  0.7636 = 605.6 / 793
## Misclassification error rate: 0.1713 = 137 / 800

(c) Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.

# See below. Node 25 is a terminal node with definition {Purchase | LoyalCH >= 0.48285, LoyalCH < 0.764572, PriceDiff < 0.145, DiscMM >= 0.47}. This node contains 16 observations with a deviance of 7.481, and predicts a Purchase value of MM - based on 6.25% (1) CH observations vs 93.75% (15) MM. This node has the lowest residual deviance of any node; other terminal nodes have a higher percentage of MM/CH (e.g. nodes 4 and 7), but a higher deviance since they contain more observations. It is also the only node on the right hand branch of the tree which predicts MM.
print(OJ.tree)
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
##
##  1) root 800 1079.000 CH ( 0.59625 0.40375 )
##    2) LoyalCH < 0.48285 318  342.600 MM ( 0.22956 0.77044 )
##      4) LoyalCH < 0.0356415 58   10.100 MM ( 0.01724 0.98276 ) *
##      5) LoyalCH > 0.0356415 260  306.800 MM ( 0.27692 0.72308 )
##       10) SalePriceMM < 2.04 140  128.300 MM ( 0.17143 0.82857 ) *
##       11) SalePriceMM > 2.04 120  161.500 MM ( 0.40000 0.60000 ) *
##    3) LoyalCH > 0.48285 482  426.700 CH ( 0.83817 0.16183 )
##      6) LoyalCH < 0.764572 231  280.000 CH ( 0.70563 0.29437 )
##       12) PriceDiff < 0.145 87  119.700 MM ( 0.44828 0.55172 )
##         24) DiscMM < 0.47 71   98.070 CH ( 0.53521 0.46479 ) *
##         25) DiscMM > 0.47 16    7.481 MM ( 0.06250 0.93750 ) *
##       13) PriceDiff > 0.145 144  116.000 CH ( 0.86111 0.13889 ) *
##      7) LoyalCH > 0.764572 251   84.050 CH ( 0.96016 0.03984 ) *

(d) Create a plot of the tree, and interpret the results.

# See below. The variable LoyalCH appears to be the most important, as it is used both for the split at the root node and also for the split at each of the subsequent internal nodes. Also note that the entire left hand side of the tree results in predictions of MM - so the reason for having the additional LoyalCH and SalePriceMM splits must presumably be to give greater purity at the terminal nodes. Also note that almost the entire right hand branch of the tree results in predictions of CH, with only one of the four nodes predicting MM.
plot(OJ.tree)
text(OJ.tree, pretty = 0, font = 4, cex = 0.75)

(e) Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?

# See below; test prediction error is 0.1926 ((23+29)/270).
# Predict class labels for the held-out 270 observations.
OJ.tree.pred <- predict(
  object = OJ.tree,
  newdata = OJ.test,
  type = "class"
)

# Confusion matrix of predictions vs actual test labels.
table(
  "Predicted" = OJ.tree.pred,
  "Actual" = OJ.test$Purchase
)
##          Actual
## Predicted  CH  MM
##        CH 153  29
##        MM  23  65
sprintf("prediction error: %.4f", mean(OJ.tree.pred != OJ.test$Purchase))
## [1] "prediction error: 0.1926"

(f) Apply the cv.tree() function to the training set in order to determine the optimal tree size.

OJ.cv.tree <- cv.tree(
  object = OJ.tree,
  FUN = prune.misclass)

OJ.cv.tree
## $size
## [1] 7 5 2 1
##
## $dev
## [1] 156 156 151 323
##
## $k
## [1]       -Inf   0.000000   4.666667 172.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune"         "tree.sequence"

(g) Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.

# Plot CV error vs tree size. NOTE: because cv.tree was called with
# FUN = prune.misclass, $dev here holds cross-validated misclassification
# counts rather than deviance, despite the axis/title labels.
ggplot() +
geom_line(
  colour = "blue",
  alpha = 0.4,
  aes(
    x = OJ.cv.tree$size,
    y = OJ.cv.tree$dev
  )
) +
scale_x_continuous(
  breaks = OJ.cv.tree$size
) +
ggtitle("Deviance vs Tree Size") +
labs(
  x = "tree size",
  y = "deviance"
)

(h) Which tree size corresponds to the lowest cross-validated classification error rate?

# See below; the lowest cross-validated error rate is found at tree size 2.
OJ.cv.tree.best <- OJ.cv.tree$size[which.min(OJ.cv.tree$dev)]
sprintf("best tree size: %d", OJ.cv.tree.best)
## [1] "best tree size: 2"

(i) Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.

OJ.tree.prune <- prune.misclass(OJ.tree, best=OJ.cv.tree.best)

(j) Compare the training error rates between the pruned and unpruned trees. Which is higher?

# See below; training error rate on the pruned tree is 18.88% - higher than on the unpruned tree (17.13%).
summary(OJ.tree.prune)
##
## Classification tree:
## snip.tree(tree = OJ.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "LoyalCH"
## Number of terminal nodes:  2
## Residual mean deviance:  0.9641 = 769.4 / 798
## Misclassification error rate: 0.1888 = 151 / 800

(k) Compare the test error rates between the pruned and unpruned trees. Which is higher?

# See below; test error rate on the pruned tree is 19.63%, only slightly higher than on the unpruned tree (19.26%).
# Predict test labels from the pruned tree.
OJ.tree.pred <- predict(
  object = OJ.tree.prune,
  newdata = OJ.test,
  type = "class"
)

# Confusion matrix of predictions vs actual test labels.
table(
  "Predicted" = OJ.tree.pred,
  "Actual" = OJ.test$Purchase
)
##          Actual
## Predicted  CH  MM
##        CH 155  32
##        MM  21  62
sprintf("prediction error: %.4f", mean(OJ.tree.pred != OJ.test$Purchase))
## [1] "prediction error: 0.1963"
clear_workspace()
detach(OJ)

Exercise 10

We now use boosting to predict Salary in the Hitters data set.

(a) Remove the observations for whom the salary information is unknown, and then log-transform the salaries.

Hitters <- na.omit(Hitters)
Hitters$Salary = log(Hitters$Salary)
attach(Hitters)

(b) Create a training set consisting of the first 200 observations, and a test set consisting of the remaining observations.

train <- 1:200
Hitters.train <- Hitters[train,]
Hitters.test <- Hitters[-train,] 

(c) Perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter λ. Produce a plot with different shrinkage values on the x-axis and the corresponding training set MSE on the y-axis.

# Specify our sequence of lambda values.
# Am using 0 to 0.01 in steps of 0.001, initially, and then
# 0.015 to 0.2 in steps of 0.01 (the original comment said 0.02,
# but the code has always used 0.2).
shrinkages <- c(seq(0, 0.01, 0.001),seq(0.015,0.2,0.01))

# Set up our data structure for storing mse.
err <- vector('numeric', length(shrinkages))

# Loop over our range of lambda values recording mse against training data.
index <- 1
for (shrinkage in shrinkages) {

  # Boosted regression model, 1,000 trees, at this shrinkage value.
  Hitters.boost <- gbm(
    formula = Salary~.,
    data = Hitters.train,
    distribution = "gaussian",
    n.trees = 1000,
    shrinkage = shrinkage
  )

  # Part (c) asks for the TRAINING MSE, so predict on the training data.
  # newdata is now passed explicitly: predict.gbm's behaviour when newdata
  # is missing is version-dependent (newer gbm versions raise an error).
  Hitters.pred <- predict(
    object = Hitters.boost,
    newdata = Hitters.train,
    n.trees = 1000
  )

  mse <- mean((Hitters.pred - Hitters.train$Salary) ^ 2)
  err[index] <- mse
  index <- index + 1
}

# Plot the results
ggplot() +
geom_line(
  colour = "blue",
  alpha = 0.4,
  aes(
    x = shrinkages,
    y = err
  )
) +
scale_x_continuous(
  breaks = seq(0, 0.2, 0.01)
) +
theme(
  axis.text.x = element_text(angle=90, vjust=0.1, hjust=1)
) +
ggtitle("Training MSE vs lambda") +
labs(
  x = "lambda",
  y = "err"
)

sprintf(
  "Minimum training MSE %.4f found with lambda value %.3f", min(err), shrinkages[which.min(err)]
)
## [1] "Minimum training MSE 0.0215 found with lambda value 0.195"

(d) Produce a plot with different shrinkage values on the x-axis and the corresponding test set MSE on the y-axis.

# Loop over our range of lambda values recording mse against test data.
# Note err (from part (c)) is reused/overwritten here, and each model is
# refitted from scratch at every lambda.
index <- 1
for (shrinkage in shrinkages) {

  Hitters.boost <- gbm(
    formula = Salary~.,
    data = Hitters.train,
    distribution = "gaussian",
    n.trees = 1000,
    shrinkage = shrinkage
  )

  # This time predict on the held-out test set.
  Hitters.pred <- predict(
    object = Hitters.boost,
    newdata = Hitters.test,
    n.trees = 1000
  )

  mse <- mean((Hitters.pred - Hitters.test$Salary) ^ 2)
  err[index] <- mse
  index <- index + 1
}

# Plot the results
ggplot() +
geom_line(
  colour = "blue",
  alpha = 0.4,
  aes(
    x = shrinkages,
    y = err
  )
) +
scale_x_continuous(
  breaks = seq(0, 0.2, 0.01)
) +
theme(
  axis.text.x = element_text(angle=90, vjust=0.1, hjust=1)
) +
ggtitle("Test MSE vs lambda") +
labs(
  x = "lambda",
  y = "err"
)

best.shrinkage <- shrinkages[which.min(err)]

sprintf(
  "Minimum test MSE %.4f found with lambda value %.3f", min(err), best.shrinkage
)
## [1] "Minimum test MSE 0.2508 found with lambda value 0.075"

(e) Compare the test MSE of boosting with the test MSE of the best linear regression model that you can build.

# See below. The Hitters dataset has 20 variables, of which we’re using 19 as predictors. This is amenable to analysis using best subsets, so my first choice was to use this method for model selection. However the lowest test MSE, with a subset of 8 predictors, was 0.4685 - much higher than the test MSE for the best boosted model (0.2508).

I subsequently adopted the “significant predictors” methodology, developed above, to remove high VIF predictors and keep any that remained with significant p-values. This led to a model with three predictors (Runs+Years+PutOuts), and a test MSE of 0.4888. Following this, I checked with polynomial terms (up to 6th power) for each of the three predictors, and kept any with significant p-values. This model resulted in a test MSE of 0.3001. I also tested a model including interactions between each of the variables; none of these had significant p-values.

The best regression model, purely on test MSE, then, is the one with 6th degree polynomials. However this doesn’t seem to be a very good (or even plausible) model, and it is entirely possible that the low test MSE is an artefact of the procedure.

# Best Subsets

nvmax <- 19

# Fit our best subsets model to the training data.
Hitters.best <- regsubsets(
  x = Salary~.,
  data = Hitters.train,
  nvmax = nvmax
)

# Create a model matrix representation of the test data, for use in matrix algebra below.
Hitters.test.mat <- model.matrix(
  object = Salary~.,
  data = Hitters.test
)

# Loop through best subsests of size 1:19 recording test MSE for each.
err <- vector('numeric', nvmax)
for (nv in 1:nvmax){
  # Get the coefficients for the best subset of predictors of size nv.
  coefi <- coef(Hitters.best, id=nv)
  # Predict values of Salary for the test data, using inner product.
  pred <- Hitters.test.mat[, names(coefi)]%*%coefi
  # Store the test MSE for this subset.
  err[nv] <- mean((Hitters.test$Salary - pred) ^ 2)
}

# Now find which number of predictors resulted in the lowest test MSE.
bestnv <- which.min(err)
sprintf("best nv: %d", bestnv)
## [1] "best nv: 8"
# And output the matching coefficients and MSE value.
Hitters.best.coef <- coef(Hitters.best, bestnv)
print(Hitters.best.coef)
##   (Intercept)         AtBat          Hits         Walks         Years
##  4.4722395349 -0.0032378215  0.0149552471  0.0105182255  0.0647798866
##         CRuns        CWalks     DivisionW       PutOuts
##  0.0012175332 -0.0010136188 -0.1482230049  0.0005050225
sprintf("best subsets test mse: %.4f", err[bestnv])
## [1] "best subsets test mse: 0.4685"

# Significant Predictors

# Check for colinearities. As before, start with the full model, and then remove any
# predictors in turn until all VIF values are < 5.
glm.fit <- glm(
  formula = Salary~.,
  data = Hitters.train
)

vif(glm.fit)
##      AtBat       Hits      HmRun       Runs        RBI      Walks
##  27.616607  32.188398   8.338397  16.708030  13.146944   4.064924
##      Years     CAtBat      CHits     CHmRun      CRuns       CRBI
##   9.107056 249.914822 502.336602  44.567322 169.171173 126.740678
##     CWalks     League   Division    PutOuts    Assists     Errors
##  20.222379   3.815089   1.087212   1.261217   2.582138   2.229147
##  NewLeague
##   3.823558
glm.fit = update(glm.fit, . ~ . -CHits)
vif(glm.fit)
##      AtBat       Hits      HmRun       Runs        RBI      Walks
##  25.441430  25.776755   8.046291  13.916420  13.094014   3.851500
##      Years     CAtBat     CHmRun      CRuns       CRBI     CWalks
##   8.829910 108.666041  24.613025  60.935774  81.489877  14.158997
##     League   Division    PutOuts    Assists     Errors  NewLeague
##   3.760900   1.087072   1.242772   2.534373   2.217990   3.772691
glm.fit = update(glm.fit, . ~ . -CAtBat)
vif(glm.fit)
##     AtBat      Hits     HmRun      Runs       RBI     Walks     Years
## 23.638795 25.550055  7.732099 13.514884 12.724481  3.801000  6.434961
##    CHmRun     CRuns      CRBI    CWalks    League  Division   PutOuts
## 16.402172 27.298898 49.293859 14.139714  3.760731  1.082907  1.241575
##   Assists    Errors NewLeague
##  2.446384  2.214559  3.756903
glm.fit = update(glm.fit, . ~ . -CRuns)
vif(glm.fit)
##     AtBat      Hits     HmRun      Runs       RBI     Walks     Years
## 23.156178 25.173554  7.732079 12.145104 12.346548  3.437421  6.402642
##    CHmRun      CRBI    CWalks    League  Division   PutOuts   Assists
## 14.244544 29.799855  9.483661  3.755625  1.075675  1.216958  2.437466
##    Errors NewLeague
##  2.209426  3.748700
glm.fit = update(glm.fit, . ~ . -CRBI)
vif(glm.fit)
##     AtBat      Hits     HmRun      Runs       RBI     Walks     Years
## 22.916658 24.053957  7.098632 12.097666 11.469671  3.259073  4.632125
##    CHmRun    CWalks    League  Division   PutOuts   Assists    Errors
##  5.058320  7.508453  3.754292  1.055571  1.210333  2.435806  2.204275
## NewLeague
##  3.728251
glm.fit = update(glm.fit, . ~ . -Hits)
vif(glm.fit)
##     AtBat     HmRun      Runs       RBI     Walks     Years    CHmRun
## 11.697745  6.161576  9.754482 10.004820  2.978310  4.614367  5.042801
##    CWalks    League  Division   PutOuts   Assists    Errors NewLeague
##  7.508351  3.754285  1.052612  1.202409  2.435253  2.189567  3.724498
glm.fit = update(glm.fit, . ~ . -AtBat)
vif(glm.fit)
##     HmRun      Runs       RBI     Walks     Years    CHmRun    CWalks
##  5.094346  4.491072  6.865407  2.977274  4.573588  5.033698  7.433926
##    League  Division   PutOuts   Assists    Errors NewLeague
##  3.717577  1.038009  1.150970  2.327308  2.166085  3.678469
glm.fit = update(glm.fit, . ~ . -CWalks)
vif(glm.fit)
##     HmRun      Runs       RBI     Walks     Years    CHmRun    League
##  5.092971  4.490243  6.500651  2.320274  2.579397  3.640863  3.701809
##  Division   PutOuts   Assists    Errors NewLeague
##  1.035579  1.145814  2.317760  2.164752  3.674730
glm.fit = update(glm.fit, . ~ . -RBI)
vif(glm.fit)
##     HmRun      Runs     Walks     Years    CHmRun    League  Division
##  3.071587  3.556694  2.315386  2.576624  3.576527  3.685153  1.035578
##   PutOuts   Assists    Errors NewLeague
##  1.131827  2.306302  2.143195  3.638249
# Check for significant predictors
summary(glm.fit)
##
## Call:
## glm(formula = Salary ~ HmRun + Runs + Walks + Years + CHmRun +
##     League + Division + PutOuts + Assists + Errors + NewLeague,
##     data = Hitters.train)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -1.76046  -0.45999   0.04022   0.42359   2.87707
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  4.3136368  0.1653621  26.086  < 2e-16 ***
## HmRun       -0.0041397  0.0084161  -0.492  0.62338
## Runs         0.0135172  0.0031998   4.224 3.73e-05 ***
## Walks        0.0020382  0.0031042   0.657  0.51224
## Years        0.0757923  0.0142758   5.309 3.09e-07 ***
## CHmRun       0.0018397  0.0010145   1.813  0.07137 .
## LeagueN      0.1289025  0.1679908   0.767  0.44385
## DivisionW   -0.1544367  0.0887317  -1.740  0.08341 .
## PutOuts      0.0005661  0.0001816   3.117  0.00211 **
## Assists      0.0008619  0.0004971   1.734  0.08456 .
## Errors      -0.0106483  0.0101328  -1.051  0.29466
## NewLeagueN   0.0178625  0.1667754   0.107  0.91482
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3797986)
##
##     Null deviance: 166.403  on 199  degrees of freedom
## Residual deviance:  71.402  on 188  degrees of freedom
## AIC: 387.58
##
## Number of Fisher Scoring iterations: 2
# Refit the model using just the significant predictors and obtain test MSE
# (Runs, Years and PutOuts are the only predictors significant at the 1% level
# in the summary above.)
glm.fit = update(glm.fit, . ~ Runs+Years+PutOuts)
glm.pred <- predict(object = glm.fit, newdata = Hitters.test)
err <- mean((Hitters.test$Salary - glm.pred) ^ 2)
print(sprintf("test mse: %.4f", err))
## [1] "test mse: 0.4888"

# Polynomial terms

# Carrying on with our "significant predictors", lets try some polynomial terms in the formula.
# Note: the plain linear terms (Runs, Years, PutOuts) are kept in the formula
# alongside the poly(..., 6) terms, so the degree-1 component of each poly()
# basis is perfectly collinear with its linear term - which is why the summary
# below reports "(3 not defined because of singularities)" with NA rows for
# poly(..., 6)1.
glm.fit = update(glm.fit, . ~ . + poly(Runs,6)+poly(Years,6)+poly(PutOuts,6))

# Check which predictors are now significant
summary(glm.fit)
##
## Call:
## glm(formula = Salary ~ Runs + Years + PutOuts + poly(Runs, 6) +
##     poly(Years, 6) + poly(PutOuts, 6), data = Hitters.train)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -1.77884  -0.21780   0.01893   0.26334   1.30489
##
## Coefficients: (3 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)        4.3596499  0.0940270  46.366  < 2e-16 ***
## Runs               0.0122198  0.0014202   8.604 3.62e-15 ***
## Years              0.0984254  0.0066676  14.762  < 2e-16 ***
## PutOuts            0.0006039  0.0001299   4.649 6.42e-06 ***
## poly(Runs, 6)1            NA         NA      NA       NA
## poly(Runs, 6)2     0.1950958  0.4844670   0.403 0.687643
## poly(Runs, 6)3    -1.5577288  0.4663734  -3.340 0.001017 **
## poly(Runs, 6)4     1.9045508  0.4460796   4.270 3.16e-05 ***
## poly(Runs, 6)5    -1.7152641  0.4539578  -3.778 0.000214 ***
## poly(Runs, 6)6     2.2849296  0.4793154   4.767 3.83e-06 ***
## poly(Years, 6)1           NA         NA      NA       NA
## poly(Years, 6)2   -4.6425275  0.4613967 -10.062  < 2e-16 ***
## poly(Years, 6)3    2.3755238  0.4615639   5.147 6.86e-07 ***
## poly(Years, 6)4    1.0040153  0.4525779   2.218 0.027769 *
## poly(Years, 6)5   -0.1852489  0.4493509  -0.412 0.680638
## poly(Years, 6)6    1.5334230  0.4448595   3.447 0.000705 ***
## poly(PutOuts, 6)1         NA         NA      NA       NA
## poly(PutOuts, 6)2  0.4063497  0.4526162   0.898 0.370496
## poly(PutOuts, 6)3 -0.3228979  0.4982753  -0.648 0.517786
## poly(PutOuts, 6)4  0.5559005  0.4871840   1.141 0.255357
## poly(PutOuts, 6)5  0.4037579  0.4646012   0.869 0.385975
## poly(PutOuts, 6)6  0.6958509  0.4822787   1.443 0.150794
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1927963)
##
##     Null deviance: 166.403  on 199  degrees of freedom
## Residual deviance:  34.896  on 181  degrees of freedom
## AIC: 258.39
##
## Number of Fisher Scoring iterations: 2
# Refit the model using those predictors/terms.
# The significant poly() components are re-expressed as raw I(x^k) terms;
# the terms that were not significant above (Runs^2, Years^5, and all the
# higher-order PutOuts terms) are dropped.
glm.fit = update(glm.fit, . ~ Runs+I(Runs^3)+I(Runs^4)+I(Runs^5)+I(Runs^6)+Years+I(Years^2)+I(Years^3)+I(Years^4)+I(Years^6)+PutOuts)

# Check test MSE
glm.pred <- predict(object = glm.fit, newdata = Hitters.test)
err <- mean((Hitters.test$Salary - glm.pred) ^ 2)
print(sprintf("test mse: %.4f", err))
## [1] "test mse: 0.3001"

# Interactions

# Finally, let's keep our three main predictors - but introduce interactions between them.
glm.fit = update(glm.fit, . ~ . +Runs:Years+Runs:PutOuts+Years:PutOuts)

# Check if any interactions are significant (spoiler alert: no).
# All three interaction terms have p-values well above 0.1 in the output below.
summary(glm.fit)
##
## Call:
## glm(formula = Salary ~ Runs + I(Runs^3) + I(Runs^4) + I(Runs^5) +
##     I(Runs^6) + Years + I(Years^2) + I(Years^3) + I(Years^4) +
##     I(Years^6) + PutOuts + Runs:Years + Runs:PutOuts + Years:PutOuts,
##     data = Hitters.train)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -1.8352  -0.2219   0.0109   0.2932   1.9681
##
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## (Intercept)    4.957e+00  4.788e-01  10.352  < 2e-16 ***
## Runs          -7.914e-02  2.325e-02  -3.404 0.000814 ***
## I(Runs^3)      8.254e-05  2.423e-05   3.406 0.000809 ***
## I(Runs^4)     -1.787e-06  5.702e-07  -3.134 0.002009 **
## I(Runs^5)      1.444e-08  5.012e-09   2.881 0.004430 **
## I(Runs^6)     -4.091e-11  1.536e-11  -2.664 0.008404 **
## Years          3.610e-01  2.232e-01   1.617 0.107487
## I(Years^2)     1.246e-04  5.480e-02   0.002 0.998189
## I(Years^3)    -2.334e-03  5.340e-03  -0.437 0.662644
## I(Years^4)     9.025e-05  1.924e-04   0.469 0.639488
## I(Years^6)    -2.253e-08  8.722e-08  -0.258 0.796480
## PutOuts        4.082e-04  4.864e-04   0.839 0.402424
## Runs:Years     4.407e-04  3.383e-04   1.303 0.194334
## Runs:PutOuts   1.189e-06  5.750e-06   0.207 0.836460
## Years:PutOuts  3.832e-06  3.081e-05   0.124 0.901140
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2521698)
##
##     Null deviance: 166.403  on 199  degrees of freedom
## Residual deviance:  46.651  on 185  degrees of freedom
## AIC: 308.45
##
## Number of Fisher Scoring iterations: 2

(f) Which variables appear to be the most important predictors in the boosted model?

# See below. In the boosted model, the most important predictor appears to be CAtBat, by some margin; followed by PutOuts, CWalks, Walks and CRBI.
# Fit a boosted regression tree using the best lambda value we found earlier.
# NOTE(review): best.shrinkage is defined in an earlier chunk (outside this
# excerpt) - presumably the CV-tuned shrinkage value; confirm it is in scope
# before running this chunk in isolation.
Hitters.boost <- gbm(
  formula = Salary~.,
  data = Hitters.train,
  distribution = "gaussian", # squared-error loss for the continuous response
  n.trees = 1000,
  shrinkage = best.shrinkage
)

# Output the summary (relative influence of variables).
# las/xlim are forwarded to the underlying relative-influence bar plot.
summary(Hitters.boost, las = 2, xlim = c(0,26))

(g) Now apply bagging to the training set. What is the test set MSE for this approach?

# See below; with bagging, the test set MSE is 0.2284 - the lowest of all the models tested.
# Bagging = a random forest where every predictor is considered at every split,
# i.e. mtry = number of predictors. length() of a data frame is its column
# count, so length(Hitters) - 1 excludes the Salary response column.
Hitters.bag <- randomForest(
  formula = Salary~.,
  data = Hitters.train,
  mtry = length(Hitters)-1
)

Hitters.pred <- predict(
  object = Hitters.bag,
  newdata = Hitters.test
)

# Test MSE on the held-out observations.
sprintf("test mse: %.4f", mean((Hitters.pred - Hitters.test$Salary) ^ 2))
## [1] "test mse: 0.2284"
# Tidy up: detach pairs with an attach(Hitters) earlier in the file.
detach(Hitters)
clear_workspace()

Exercise 11

This question uses the Caravan data set.

# Creating a dummy variable, since otherwise get the following error in gbm fit:
# "Error in gbm.fit Bernoulli requires the response to be in {0,1}". Alternative
# would be to use e.g. unclass(Purchase)-1 ~ 1 as the formula, but for the sake of clarity..
# PurchaseY is appended as the LAST column of Caravan; later code that drops
# the response columns by index relies on this.
PurchaseY <- ifelse(Caravan$Purchase == 'Yes', 1, 0)
Caravan <- data.frame(Caravan, PurchaseY)
# Remove the temporary vector so only the data-frame column remains.
rm(PurchaseY)
attach(Caravan)

(a) Create a training set consisting of the first 1,000 observations, and a test set consisting of the remaining observations.

# First 1,000 observations form the training set, per the exercise text;
# everything else is the test set.
train <- 1:1000
Caravan.train <- Caravan[train,]
Caravan.test <- Caravan[-train,]
# Cached row count, reused when building prediction vectors below.
Caravan.test.numrows <- nrow(Caravan.test)

(b) Fit a boosting model to the training set with Purchase as the response and the other variables as predictors. Use 1,000 trees, and a shrinkage value of 0.01. Which predictors appear to be the most important?

# See below; PPERSAUT is the most important variable, followed by MKOOPKLA and MOPLHOOG.
# Fit a boosted classification tree.
# Note that we get some warnings about variables having no variation;
# presumably e.g. AVRAAUT is all one value *in our training split*
# We could try another split, but this would likely bias the results -
# plus the instructions explicitly state that the first 1000 observations
# should be the training set.
# The factor Purchase is excluded from the predictors so only the 0/1
# dummy PurchaseY acts as the response.
Caravan.boost <- gbm(
  formula = PurchaseY~.-Purchase,
  data = Caravan.train,
  distribution = "bernoulli", # as response is categorical
  n.trees = 1000,
  shrinkage = 0.01
)
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 71: AVRAAUT has no variation.
# Output the summary (relative influence of variables).
summary(Caravan.boost, plotit=FALSE)

(c) Use the boosting model to predict the response on the test data. Predict that a person will make a purchase if the estimated probability of purchase is greater than 20%. Form a confusion matrix. What fraction of the people predicted to make a purchase do in fact make one? How does this compare with the results obtained from applying KNN or logistic regression to this data set?

# See below. With the boosting model and a threshold of 20% for estimated probability of Purchase, 20% of Yes predictions are correct (31 out of a total 124+31 Yes predictions). This compares with 14.2% using logistic regression. The best KNN model (k=11) scored 0% - since it did not predict any of the 289 actual purchases. However this is because the model was designed to minimise classification error overall; it had 5.99% classification error (although note that this is because it always predicted No - which could also have been achieved by just predicting the majority class). The best KNN model designed to minimise false positives (k=1) still only resulted in 11.53% correct Yes predictions.

# Boosting

# type = "response" returns predicted probabilities for the bernoulli fit.
Caravan.pred <- predict(
  object = Caravan.boost,
  newdata = Caravan.test,
  n.trees = 1000,
  type = "response"
)

# Classify as "Yes" whenever the predicted probability exceeds the 20%
# threshold specified by the exercise.
Caravan.pred.purchase <- rep("No", Caravan.test.numrows)
Caravan.pred.purchase[Caravan.pred > .2] <- "Yes"
# output the confusion matrix
# (NOTE(review): `matrix` shadows base::matrix within this script.)
matrix = table(
  "Predicted" = Caravan.pred.purchase,
  "Actual" = Caravan.test$Purchase
)
print(matrix)
##          Actual
## Predicted   No  Yes
##       No  4409  258
##       Yes  124   31
# output the fraction of predicted purchasers, who actually purchase
# (row 2 = predicted "Yes"; column 2 = actual "Yes")
sprintf("fraction of Yes predictions which were correct: %.4f",
    matrix[2,2] / (matrix[2,1] + matrix[2,2]))
## [1] "fraction of Yes predictions which were correct: 0.2000"

# Logistic Regression

# Fit the model.
# Note that we get some warnings about "fitted probabilities numerically 0 or 1 occurred";
# again, presumably Purchase is always either Yes or No with respect to one or more of our
# predictor variables - resulting in them being given unreasonably large weightings (per
# discussion at https://www.r-bloggers.com/learn-logistic-regression-and-beyond).
Caravan.glm.fit <- glm(
  # not using bernoulli, so can use regular Purchase variable as predictor.
  formula = Purchase ~ . -PurchaseY,
  data = Caravan.train,
  family = binomial
  # FIX(review): the original call also passed `subset = train`. Since
  # Caravan.train already *is* rows 1:1000, that was redundant double
  # subsetting - and would silently select the wrong rows if `train` were
  # ever redefined - so it has been removed. The fitted model is identical.
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Per the warnings, we should expect to see that certain variables have been assigned
# much larger coefficients than the others. ABESAUT, for example (173.037773070).
# The NA coefficients (PVRAAUT, AVRAAUT, AZEILPL) indicate rank deficiency -
# presumably those columns carry no usable variation in the training split,
# consistent with the gbm "no variation" warnings earlier.
coef(Caravan.glm.fit)
##   (Intercept)       MOSTYPE      MAANTHUI       MGEMOMV      MGEMLEEF
## 256.111613608  -0.018137836   0.071310327  -0.929793800   0.041869025
##      MOSHOOFD        MGODRK        MGODPR        MGODOV        MGODGE
##   0.139981328  -0.723032025  -0.320095654  -0.649867202  -0.277324784
##        MRELGE        MRELSA        MRELOV      MFALLEEN      MFGEKIND
##   0.798945489   0.691674967   0.876189582  -0.370222996  -0.298613883
##      MFWEKIND      MOPLHOOG      MOPLMIDD      MOPLLAAG      MBERHOOG
##  -0.053447705  -0.434233684  -0.693582737  -0.511762343   0.108009835
##      MBERZELF      MBERBOER      MBERMIDD      MBERARBG      MBERARBO
##  -0.273724128  -0.578837395   0.419310171   0.278734998   0.351076566
##          MSKA         MSKB1         MSKB2          MSKC          MSKD
##   0.359087294   0.143421675   0.178304030   0.109316099  -0.386861444
##        MHHUUR        MHKOOP         MAUT1         MAUT2         MAUT0
## -15.683220360 -15.614447804   0.423286795   0.430429974   0.225560470
##       MZFONDS        MZPART       MINKM30      MINK3045      MINK4575
## -13.758701896 -13.768602205   0.112284354   0.092553590   0.260611332
##      MINK7512      MINK123M       MINKGEM      MKOOPKLA       PWAPART
##   0.360128618  -0.140712551  -0.364332347   0.232451203   0.934257620
##       PWABEDR       PWALAND      PPERSAUT       PBESAUT       PMOTSCO
##  -0.905582577 -17.518973254   0.375731995 -37.923562822   0.222956221
##       PVRAAUT      PAANHANG      PTRACTOR        PWERKT         PBROM
##            NA  15.732379673  -0.332629103  16.761744979  -0.109445550
##        PLEVEN      PPERSONG       PGEZONG       PWAOREG        PBRAND
##   0.240508000   1.268522655  19.997061590  14.524192278   0.033690501
##       PZEILPL      PPLEZIER        PFIETS       PINBOED      PBYSTAND
##  37.381664743   0.637109636  21.014603746   0.569945986   0.416706179
##       AWAPART       AWABEDR       AWALAND      APERSAUT       ABESAUT
##  -1.474801231 -14.172998132  54.155870772  -0.657163782 173.037773070
##       AMOTSCO       AVRAAUT      AAANHANG      ATRACTOR        AWERKT
##  -0.114337560            NA -30.987605267 -15.450929240            NA
##         ABROM        ALEVEN      APERSONG       AGEZONG       AWAOREG
##  -0.251095198  -0.860027186 -18.787781645 -57.625956606 -31.901920342
##        ABRAND       AZEILPL      APLEZIER        AFIETS       AINBOED
##   0.572476509            NA   1.083398968 -19.285105277 -19.789292840
##      ABYSTAND
##  -0.005834623
# Pick out the coefficients above an (arbitrary) magnitude of 30; the [2:...]
# subset below drops the first matching entry, the intercept.
high.coef = na.omit(coef(Caravan.glm.fit)[coef(Caravan.glm.fit) > 30])
high.coef[2:length(high.coef)]
##   PZEILPL   AWALAND   ABESAUT
##  37.38166  54.15587 173.03777
# Make predictions (probabilities) on the test data.
Caravan.pred <- predict(
  object = Caravan.glm.fit,
  newdata = Caravan.test,
  type = "response"
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Convert our probabilities into No/Yes responses, with 20% threshold.
Caravan.pred.purchase <- rep("No", Caravan.test.numrows)
Caravan.pred.purchase[Caravan.pred > .2] <- "Yes"

# output the confusion matrix
matrix = table(
  "Predicted" = Caravan.pred.purchase,
  "Actual" = Caravan.test$Purchase
)
print(matrix)
##          Actual
## Predicted   No  Yes
##       No  4183  231
##       Yes  350   58
# output the fraction of predicted purchasers, who actually purchase
# (row 2 = predicted "Yes"; column 2 = actual "Yes")
sprintf("fraction of Yes predictions which were correct: %.4f",
  matrix[2,2] / (matrix[2,1] + matrix[2,2]))
## [1] "fraction of Yes predictions which were correct: 0.1422"

# KNN

# Eyeballing the data, it's possible that we don't need to standardize the values -
# since they all seem to be in the same range already. However just to be sure..
maxk <- 100
# Columns 86 and 87 are dropped before scaling - presumably the Purchase
# factor and the PurchaseY dummy appended earlier (TODO confirm indices).
standardized.X <- scale(Caravan[, -c(86,87)])
train.X <- standardized.X[train,]
test.X <- standardized.X[-train,]
train.Y <- Caravan.train$Purchase
test.Y <- Caravan.test$Purchase
# best_knn (defined at the top of the file) sweeps k = 1..maxk, plots the
# error curve and prints the confusion matrix for the best k.
best_knn(maxk, train.X, test.X, train.Y, test.Y)

## [1] "best k = 11"
## [1] "percentage correct = 94.01"
## [1] "prediction error = 0.0599"
##          Actual
## Predicted   No  Yes
##       No  4533  289
##       Yes    0    0
# OK, KNN predicted zero of the 289 actual Purchases. This is probably because the
# error being minimised is "knn.pred != test.Y" e.g. classification errors overall,
# rather than just false positives. Let's try again, only this time minimise the
# number of false positives.
#
# FIX(review): two defects in the original version of this loop:
#  1. `vector(mode = "numeric", length = )` passed an *empty* length argument,
#     which yields a zero-length vector; it is now preallocated to maxk.
#  2. The per-k error indexed knn.pred/test.Y by a factor of "Yes" values
#     (which R coerces to its integer level codes), so it was not measuring
#     false positives at all. It now restricts to the observations predicted
#     "Yes" and measures the fraction of those that are wrong (1 - precision).
#     Recorded outputs further down were produced by the original computation
#     and may therefore differ on a re-run.
err <- vector(mode = "numeric", length = maxk)

for (k in 1 : maxk) {
  knn.pred <- knn(
    train = train.X,
    test = test.X,
    cl = train.Y,
    k = k
  )
  # Logical mask of test observations predicted "Yes".
  pred.yes <- knn.pred == 'Yes'
  # Fraction of predicted purchases that were wrong. NaN when no "Yes"
  # predictions were made for this k; which.min() below skips NaN entries.
  err[k] <- mean(knn.pred[pred.yes] != test.Y[pred.yes])
}

# Find the value of k which had the minimum error.
# (which.min ignores any NA/NaN entries in err.)
bestk <- which.min(err)
sprintf("best k = %d", bestk)
## [1] "best k = 1"
# Now retrain the model with the best value of k.
knn.pred <- knn(
  train = train.X,
  test = test.X,
  cl = train.Y,
  k = bestk
)

# output the confusion matrix
matrix = table(
  "Predicted" = knn.pred,
  "Actual" = test.Y
)
print(matrix)
##          Actual
## Predicted   No  Yes
##       No  4249  252
##       Yes  284   37
# output the fraction of predicted purchasers, who actually purchase
print(matrix[2,2] / (matrix[2,1] + matrix[2,2]))
## [1] 0.1152648
# Reset global state ready for the next exercise.
clear_workspace()
detach(Caravan)

Week 10

Build a Ridge and a Lasso model for the problem considered in Exercise 10 from Chapter 8 of Tibshirani’s ISLR book, with particular values of the models’ parameters chosen by you. Then tune/optimise the models with cross validation. Compare your optimised Ridge and Lasso models with the models you developed in point (e) of the above mentioned exercise, and choose the best one justifying your choice.

# See below. The model with the lowest test MSE (0.3001) from (e) above - the model with 6th order polynomial terms - resulted in both Lasso and Ridge models having a higher test MSE (0.3872 and 0.4069 respectively). This seems surprising, however given that both Ridge and Lasso punish complexity, and given that the success of this model on the test data is likely to have been due to its complexity, then perhaps it is an indicator that this is not a good model.

Alternatively, using all predictors, both Lasso and Ridge had marginally lower test MSE than the best subsets model from (e) above; 0.4714 and 0.4554, respectively - vs the earlier 0.4888.

In terms of choosing one of the three as the best linear model, then irrespective of the lowest test MSE I would pick ridge. The model is already reasonably sparse, reducing the value of lasso over it; as can be seen from the bar chart of Ridge coefficients (using all predictors), it arrives at a similar number of “main” variables as the other two. Best subsets may sound like a better alternative (since we have a low enough number of predictors to allow a brute-force approach) however I suspect that ridge - being by design a guard against over-fitting - might cope better with unseen data, if we really did have a test (as opposed to a validation) set.

# Using model with best test MSE from earlier

# Once again remove NAs and log-transform Salary.  Note we don't need to set a seed here,
# e.g. to re-create the state of our data from up the page, since we're not using any randomness.
# NOTE(review): this chunk is not idempotent - re-running it would log() Salary twice.
Hitters <- na.omit(Hitters)
Hitters$Salary = log(Hitters$Salary)
attach(Hitters)

# Create a model matrix of our predictors (using the best combination of predictors/polynomials/interactions
# found in (e) above.
# FIX(review): `data = Hitters` is now passed explicitly rather than relying on
# the attach() above - identical result, but more robust and consistent with
# the equivalent model.matrix() call in the all-predictors section of the file.
x <- model.matrix(Salary ~ Runs+I(Runs^3)+I(Runs^4)+I(Runs^5)+I(Runs^6)+Years+I(Years^2)+I(Years^3)+I(Years^4)+I(Years^6)+PutOuts, data = Hitters)[,-1]

# Set up our training/test split, with the training set being the first 200 observations.
train <- 1:200
y <- Hitters$Salary
x.train <- x[train,]
x.test <- x[-train,]
y.train <- y[train]
y.test <- y[-train]

# Ridge Regression

# Use cross-validation to find the best value of lambda (for ridge regression) given our training set.
# (cv.glmnet's fold assignment is random, so results depend on the RNG state.)
Hitters.cv.ridge <- cv.glmnet(
  x = x.train,
  y = y.train,
  alpha = 0 # specify ridge regression
)

Hitters.ridge.bestlam <- Hitters.cv.ridge$lambda.min
sprintf("Ridge best lambda: %.4f", Hitters.ridge.bestlam)
## [1] "Ridge best lambda: 0.0543"
plot(Hitters.cv.ridge) 

# Predict salaries based on this model.
# Fit over the default lambda path, then evaluate at the CV-chosen lambda via s=.
Hitters.ridge <- glmnet(
  x = x.train,
  y = y.train,
  alpha = 0
)

Hitters.pred <- predict(
  object = Hitters.ridge,
  s = Hitters.ridge.bestlam,
  newx = x.test
)

# Calculate the test MSE.
sprintf("Ridge test MSE: %.4f", mean((Hitters.pred - y.test) ^ 2))
## [1] "Ridge test MSE: 0.4069"
# Inspect the coefficients from our best ridge model.
# [2:12,] drops the intercept row, keeping the 11 model-matrix columns.
coefs <- predict(
  object = Hitters.ridge,
  type = "coefficients",
  s = Hitters.ridge.bestlam
)[2:12,]

# Output the coefficients as a data frame.
coefs.frame <- data.frame("var" = names(data.frame(x)), "coef" = coefs)
coefs.frame[order(coefs.frame$var),]
# Also plot the coefficients as a bar chart.
print(
   ggplot(
    data = coefs.frame,
    aes(
      x = var,
      y = coef
    )
  ) +
  geom_bar(
    stat="identity"
  ) +
  theme(
    axis.text.x = element_text(angle=90, vjust=0.1, hjust=1)
  )
)

# Lasso

# Cross-validate to pick lambda for the lasso (alpha = 1) on the same
# polynomial-terms model matrix.
Hitters.cv.lasso <- cv.glmnet(
  x = x.train,
  y = y.train,
  alpha = 1 # lasso
)

Hitters.lasso.bestlam <- Hitters.cv.lasso$lambda.min
sprintf("Lasso best lambda: %.4f", Hitters.lasso.bestlam)
## [1] "Lasso best lambda: 0.0033"
plot(Hitters.cv.lasso) 

# Predict salaries based on this model.
Hitters.lasso <- glmnet(
  x = x.train,
  y = y.train,
  alpha = 1
)

Hitters.pred <- predict(
  object = Hitters.lasso,
  s = Hitters.lasso.bestlam,
  newx = x.test
)

# Calculate the test MSE.
sprintf("Lasso test MSE: %.4f", mean((Hitters.pred - y.test) ^ 2))
## [1] "Lasso test MSE: 0.3872"
# Retrieve the coefficients.
# [2:12,] drops the intercept row, keeping the 11 model-matrix columns.
coefs <- predict(
    object = Hitters.lasso,
    type = "coefficients",
    s = Hitters.lasso.bestlam
)[2:12,]

# Output those coefficients which haven't been shrunk to zero.
coefs[abs(coefs) > 0]
##          Runs     I(Runs^3)     I(Runs^6)         Years    I(Years^2)
##  1.270983e-02  7.462048e-08 -5.429968e-14  2.870710e-01 -8.371021e-07
##    I(Years^3)    I(Years^6)       PutOuts
## -7.985313e-04  3.410067e-08  5.717087e-04

# Using all predictors

# Rebuild the model matrix with every predictor (dummy-coding factors and
# dropping the intercept column), then recreate the same 200-row training split.
x <- model.matrix(Salary ~ ., data = Hitters)[,-1]
train <- 1:200
y <- Hitters$Salary
x.train <- x[train,]
x.test <- x[-train,]
y.train <- y[train]
y.test <- y[-train]

# Ridge Regression

# Use cross-validation to find the best value of lambda (for ridge regression) given our training set.
Hitters.cv.ridge <- cv.glmnet(
  x = x.train,
  y = y.train,
  alpha = 0 # specify ridge regression
)

Hitters.ridge.bestlam <- Hitters.cv.ridge$lambda.min
sprintf("Ridge best lambda: %.4f", Hitters.ridge.bestlam)
## [1] "Ridge best lambda: 0.1336"
# Predict salaries based on this model.
Hitters.ridge <- glmnet(
  x = x.train,
  y = y.train,
  alpha = 0
)

Hitters.pred <- predict(
  object = Hitters.ridge,
  s = Hitters.ridge.bestlam,
  newx = x.test
)

# Calculate the test MSE.
sprintf("Ridge test MSE: %.4f", mean((Hitters.pred - y.test) ^ 2))
## [1] "Ridge test MSE: 0.4554"
# Inspect the coefficients from our best ridge model.
# [2:20,] drops the intercept row, keeping the 19 dummy-coded predictors.
coefs <- predict(
  object = Hitters.ridge,
  type = "coefficients",
  s = Hitters.ridge.bestlam
)[2:20,]

# Output the coefficients as a data frame.
coefs.frame <- data.frame("var" = names(data.frame(x)), "coef" = coefs)
coefs.frame[order(coefs.frame$var),]
# Also plot the coefficients as a bar chart.
print(
   ggplot(
    data = coefs.frame,
    aes(
      x = var,
      y = coef
    )
  ) +
  geom_bar(
    stat="identity"
  ) +
  theme(
    axis.text.x = element_text(angle=90, vjust=0.1, hjust=1)
  )
)

# Lasso - Using all predictors
# (the heading previously said "Using model with best test MSE from earlier",
# but this section fits against the all-predictors model matrix built above)

# Now use cross-validation to find the best value of lambda - for lasso - given our training data.
Hitters.cv.lasso <- cv.glmnet(
  x = x.train,
  y = y.train,
  alpha = 1 # lasso
)

Hitters.lasso.bestlam <- Hitters.cv.lasso$lambda.min
sprintf("Lasso best lambda: %.4f", Hitters.lasso.bestlam)
## [1] "Lasso best lambda: 0.0029"
# Predict salaries based on this model.
Hitters.lasso <- glmnet(
  x = x.train,
  y = y.train,
  alpha = 1
)

Hitters.pred <- predict(
  object = Hitters.lasso,
  s = Hitters.lasso.bestlam,
  newx = x.test
)

# Calculate the test MSE.
sprintf("Lasso test MSE: %.4f", mean((Hitters.pred - y.test) ^ 2))
## [1] "Lasso test MSE: 0.4714"
# Retrieve the coefficients.
# [2:20,] drops the intercept row, keeping the 19 dummy-coded predictors.
coefs <- predict(
    object = Hitters.lasso,
    type = "coefficients",
    s = Hitters.lasso.bestlam
)[2:20,]

# Output those coefficients which haven't been shrunk to zero.
coefs[abs(coefs) > 0]
##         AtBat          Hits         HmRun          Runs           RBI
## -0.0028300893  0.0133732790  0.0029395173  0.0011132052 -0.0005648055
##         Walks         Years        CHmRun         CRuns        CWalks
##  0.0085259992  0.0626866539  0.0014680812  0.0008631622 -0.0008873527
##       LeagueN     DivisionW       PutOuts       Assists        Errors
##  0.1222336365 -0.1441197451  0.0005032325  0.0007726510 -0.0093837816
##    NewLeagueN
## -0.0111156444
## -0.0111156444